--- title: "Combining 'Traditional' and Text-Based Models to Board Game Ratings" author: Brendan Graham date: '2022-01-26' slug: boardgames categories: - tidy tuesday - tidymodels - data science tags: - tidy tuesday - tidymodels - data science subtitle: summary: 'This post looks at a past [TidyTuesday](https://github.com/rfordatascience/tidytuesday) data set about board game ratings. After looking at the data I attempt to predict average board game ratings.' featured: no image: caption: '' focal_point: '' preview_only: no projects: [] draft: false ---

Last week I tried out a text based model for the first time. This week I want to continue working with a text based model, but supplement the text data with other non-text predictors. The goal will be to use the board game category (text data) and other non-text data to predict the average board game rating.

# Quick look at the ratings table (link columns dropped) plus a skim summary
# of every variable.
ratings %>%
  select(-url, -thumbnail) %>%
  add_table()

skimr::skim_to_list(ratings)

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 1 107 0 21432 0
url 0 1 16 68 0 21831 0
thumbnail 6 1 135 139 0 21816 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
num 0 1 10915.00 6302.21 0.00 5457.50 10915.00 16372.50 21830.00 ▇▇▇▇▇
id 0 1 118144.78 105369.55 1.00 12308.50 104994.00 207219.00 350992.00 ▇▂▃▃▂
year 0 1 1987.44 193.51 0.00 2001.00 2011.00 2017.00 3500.00 ▁▁▇▁▁
rank 0 1 10916.00 6302.21 1.00 5458.50 10916.00 16373.50 21831.00 ▇▇▇▇▇
average 0 1 6.42 0.93 1.04 5.83 6.45 7.04 9.57 ▁▁▅▇▁
bayes_average 0 1 5.68 0.36 0.00 5.51 5.54 5.67 8.51 ▁▁▁▇▁
users_rated 0 1 866.96 3679.82 30.00 56.00 122.00 392.00 108975.00 ▇▁▁▁▁
# Quick look at the details table plus a skim summary of every variable.
add_table(details)
skimr::skim_to_list(details)

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
primary 0 1.00 1 107 0 21236 0
description 1 1.00 49 16144 0 21615 0
boardgamecategory 283 0.99 8 216 0 6730 0
boardgamemechanic 1590 0.93 8 478 0 8291 0
boardgamefamily 3761 0.83 13 2768 0 11285 0
boardgameexpansion 16125 0.25 7 18150 0 5264 0
boardgameimplementation 16769 0.22 6 890 0 4247 0
boardgamedesigner 596 0.97 7 332 0 9136 0
boardgameartist 5907 0.73 6 8408 0 9080 0
boardgamepublisher 1 1.00 6 3744 0 11265 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
num 0 1 10815.00 6244.48 0 5407.5 10815 16222.5 21630 ▇▇▇▇▇
id 0 1 118133.09 105310.42 1 12280.5 105187 207013.0 350992 ▇▂▃▃▂
yearpublished 0 1 1986.09 210.04 -3500 2001.0 2011 2017.0 2023 ▁▁▁▁▇
minplayers 0 1 2.01 0.69 0 2.0 2 2.0 10 ▇▁▁▁▁
maxplayers 0 1 5.71 15.10 0 4.0 4 6.0 999 ▇▁▁▁▁
playingtime 0 1 90.51 534.83 0 25.0 45 90.0 60000 ▇▁▁▁▁
minplaytime 0 1 63.65 447.21 0 20.0 30 60.0 60000 ▇▁▁▁▁
maxplaytime 0 1 90.51 534.83 0 25.0 45 90.0 60000 ▇▁▁▁▁
minage 0 1 9.61 3.64 0 8.0 10 12.0 25 ▂▇▆▁▁
owned 0 1 1487.92 5395.08 0 150.0 322 903.5 168364 ▇▁▁▁▁
trading 0 1 43.59 102.41 0 5.0 13 38.0 2508 ▇▁▁▁▁
wanting 0 1 42.03 117.94 0 3.0 9 29.0 2011 ▇▁▁▁▁
wishing 0 1 233.66 800.66 0 14.0 39 131.0 19325 ▇▁▁▁▁

Explore

There are 200 games in the ratings data with no matching row in the details data:

# Games present in ratings but absent from details.
ratings %>%
  anti_join(details, by = "id") %>%
  nrow()
## [1] 200
# Count of distinct game names in the ratings data (used in the plot title).
n_games <- 
  ratings %>% 
  distinct(name) %>% 
  nrow()

# Overall mean rating; reused for both the reference line and the subtitle.
overall_avg <- 
  ratings %>% 
  summarise(mean_rating = mean(average, na.rm = TRUE)) %>%
  pull(mean_rating)

ratings %>%
  ggplot(aes(x = average)) + 
  geom_histogram(alpha = .75, fill = bg_green) + 
  bg_theme() + 
  # reuse the precomputed overall mean instead of recomputing it inside aes()
  geom_vline(xintercept = overall_avg, linetype = 'dashed') + 
  scale_y_continuous(expand = c(0, 25), breaks = seq(0, 3000, 100)) + 
  labs(x = "rating", y = '',
       title = paste("rating distribution of", format(n_games, big.mark = ','), "games"),
       subtitle = paste("overall avg rating:", round(overall_avg, 2)))

Here we combine the ratings data with the details and tokenize the category column into individual rows per category descriptor:

# Join ratings to details, keep only the columns used downstream, and clean
# the category text: strip quote and bracket characters, then trim whitespace.
combined <- 
  ratings %>%
  inner_join(details %>% select(-num), by = "id") %>%
  select(-c(url, rank, bayes_average, users_rated, yearpublished, thumbnail, primary,
            boardgameexpansion, boardgameimplementation, boardgamedesigner, description,
            boardgamepublisher, boardgameartist, boardgamefamily, boardgamemechanic,
            owned, trading, wanting, wishing, minplaytime, maxplaytime, year)) %>%
  select(num, id, name, average, boardgamecategory, everything()) %>%
  rename(category = boardgamecategory) %>%
  # single character class replaces the three separate remove/replace calls:
  # drops single quotes, double quotes, and square brackets anywhere in the string
  mutate(category = str_remove_all(category, "['\"\\[\\]]"),
         category = trimws(category)) %>%
  filter(!is.na(category))

# One row per (game, category token).
tidy_category <-
  unnest_tokens(combined, word, category)

# Most frequent category tokens.
count(tidy_category, word, sort = TRUE)
## # A tibble: 113 × 2
##    word         n
##    <chr>    <int>
##  1 game     10882
##  2 card      6402
##  3 wargame   3820
##  4 fantasy   2681
##  5 war       1996
##  6 party     1968
##  7 dice      1847
##  8 fiction   1666
##  9 science   1666
## 10 fighting  1658
## # … with 103 more rows

then we check the relationship between the numeric columns with the average:

# Scatter (with smoother) of average rating against one numeric column.
#
# Args:
#   data: data frame containing `average` plus the column named by `var`.
#   var:  character scalar naming the numeric column to put on the y axis.
#
# Returns: a ggplot object.
get_scatter <- 
  function(data, var){
    
    data %>%
      # all_of() replaces the superseded one_of(); errors early if `var`
      # names a column that does not exist
      select(average, all_of(var)) %>%
      rename(variable = 2) %>%
      ggplot(aes(x = average, y = variable)) +
      geom_jitter(alpha = .3) + 
      geom_smooth(se = FALSE) + 
      bg_theme() +
      labs(x = "avg rating", y = var)
    
  }

# Names of the numeric predictor columns (ids and the outcome excluded).
numeric_cols <- 
  combined %>%
  select(where(is.numeric)) %>%
  select(-id, -num, -average) %>%
  names()

# One scatter plot per numeric predictor.
purrr::map(numeric_cols, ~ get_scatter(combined, .x))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Model

To prep the text data for modeling we can create a matrix with each word's TF-IDF value. From the step_tfidf() documentation:

Term frequency-inverse document frequency is the product of two statistics: the term frequency (TF) and the inverse document frequency (IDF).

Term frequency measures how many times each token appears in each observation.

Inverse document frequency is a measure of how informative a word is, e.g., how common or rare the word is across all the observations. If a word >appears in all the observations it might not give that much insight, but if it only appears in some it might help differentiate between observations.

The IDF is defined as follows: idf = log(1 + (# documents in the corpus) / (# documents where the term appears))

# Build the TF-IDF feature matrix from the category text: tokenize the
# category column, then turn each token into a tfidf_category_* column.
text_prep <- 
  recipe(average ~ num + id + name + category, data = combined) %>%
  # num/id/name are identifiers, not predictors
  update_role(num, id, name, new_role = "ID variables") %>%
  step_tokenize(category) %>%
  # step_tokenfilter(category) %>%   # left disabled: no cap on vocabulary size
  step_tfidf(category) %>%
  prep() %>%
  # bake(new_data = NULL) returns the processed training data itself
  bake(new_data = NULL)

text_prep
## # A tibble: 21,348 × 117
##      num     id name  average tfidf_category_… tfidf_category_… tfidf_category_…
##    <dbl>  <dbl> <fct>   <dbl>            <dbl>            <dbl>            <dbl>
##  1   105  30549 Pand…    7.59                0                0                0
##  2   189    822 Carc…    7.42                0                0                0
##  3   428     13 Catan    7.14                0                0                0
##  4    72  68448 7 Wo…    7.74                0                0                0
##  5   103  36218 Domi…    7.61                0                0                0
##  6   191   9209 Tick…    7.41                0                0                0
##  7   100 178900 Code…    7.6                 0                0                0
##  8     3 167791 Terr…    8.42                0                0                0
##  9    15 173346 7 Wo…    8.11                0                0                0
## 10    35  31260 Agri…    7.93                0                0                0
## # … with 21,338 more rows, and 110 more variables:
## #   tfidf_category_adventure <dbl>, tfidf_category_age <dbl>,
## #   tfidf_category_agents <dbl>, tfidf_category_american <dbl>,
## #   tfidf_category_ancient <dbl>, tfidf_category_and <dbl>,
## #   tfidf_category_animals <dbl>, tfidf_category_arabian <dbl>,
## #   tfidf_category_aviation <dbl>, tfidf_category_base <dbl>,
## #   tfidf_category_based <dbl>, tfidf_category_bluffing <dbl>, …
# Re-attach the TF-IDF features to the cleaned data, matching games by id;
# drop the duplicated columns from text_prep before joining.
model_data <- 
  combined %>%
  select(-category) %>%
  left_join(text_prep %>% select(-num, -name, -average), by = "id")

model_data
## # A tibble: 21,348 × 121
##      num     id name            average minplayers maxplayers playingtime minage
##    <dbl>  <dbl> <chr>             <dbl>      <dbl>      <dbl>       <dbl>  <dbl>
##  1   105  30549 Pandemic           7.59          2          4          45      8
##  2   189    822 Carcassonne        7.42          2          5          45      7
##  3   428     13 Catan              7.14          3          4         120     10
##  4    72  68448 7 Wonders          7.74          2          7          30     10
##  5   103  36218 Dominion           7.61          2          4          30     13
##  6   191   9209 Ticket to Ride     7.41          2          5          60      8
##  7   100 178900 Codenames          7.6           2          8          15     14
##  8     3 167791 Terraforming M…    8.42          1          5         120     12
##  9    15 173346 7 Wonders Duel     8.11          2          2          30     10
## 10    35  31260 Agricola           7.93          1          5         150     12
## # … with 21,338 more rows, and 113 more variables:
## #   tfidf_category_abstract <dbl>, tfidf_category_action <dbl>,
## #   tfidf_category_adult <dbl>, tfidf_category_adventure <dbl>,
## #   tfidf_category_age <dbl>, tfidf_category_agents <dbl>,
## #   tfidf_category_american <dbl>, tfidf_category_ancient <dbl>,
## #   tfidf_category_and <dbl>, tfidf_category_animals <dbl>,
## #   tfidf_category_arabian <dbl>, tfidf_category_aviation <dbl>, …

Then we prep for modelling by creating splits, resamples, model specifications, recipes (several to compare), and the workflowset:

set.seed(113)

# Train/test split, stratified on the outcome.
splits <- 
  initial_split(model_data, strata = average)

train <- training(splits)
test <- testing(splits)

# Cross-validation folds built from the training set, again stratified.
folds <- 
  vfold_cv(train, strata = average)
# Full recipe: every remaining column is a predictor; identifier columns are
# demoted to ID roles and the numeric predictors are centred/scaled.
recipe_all <- 
  recipe(average ~ ., data = train) %>%
  update_role(num, id, name, new_role = "ID variables") %>%
  step_normalize(minplayers, maxplayers, playingtime, minage)

# Text-only recipe: drop the numeric predictors, keeping the tfidf columns.
recipe_text <- 
  recipe_all %>%
  step_rm(minplayers, maxplayers, playingtime, minage)

# Numeric-only recipe: drop the tfidf text features.
recipe_no_text <- 
  recipe_all %>%
  step_rm(starts_with('tfidf'))

# Preview the fully-processed training data for the full recipe.
recipe_all %>% 
  prep() %>%
  bake(new_data = NULL)
## # A tibble: 16,009 × 121
##      num    id name    minplayers maxplayers playingtime minage tfidf_category_…
##    <dbl> <dbl> <fct>        <dbl>      <dbl>       <dbl>  <dbl>            <dbl>
##  1 20904   181 Risk       -0.0106     0.0161     0.0460   0.106                0
##  2 21823  1406 Monopo…    -0.0106     0.137      0.146   -0.445                0
##  3 21710  2223 UNO        -0.0106     0.257     -0.104   -0.995                0
##  4  8934  1294 Clue       -0.0106     0.0161    -0.0786  -0.445                0
##  5  5281    74 Apples…     2.89       0.257     -0.104    0.656                0
##  6  9239   258 Fluxx      -0.0106     0.0161    -0.104   -0.445                0
##  7  6682  2471 Zombie…    -0.0106     0.0161    -0.00383  1.48                 0
##  8 21631  2243 Yahtzee    -0.0106     0.257     -0.104   -0.995                0
##  9 10522  2452 Jenga      -1.46       0.137     -0.120   -0.995                0
## 10 21822  2921 The Ga…    -0.0106     0.0161    -0.0537  -0.445                0
## # … with 15,999 more rows, and 113 more variables: tfidf_category_action <dbl>,
## #   tfidf_category_adult <dbl>, tfidf_category_adventure <dbl>,
## #   tfidf_category_age <dbl>, tfidf_category_agents <dbl>,
## #   tfidf_category_american <dbl>, tfidf_category_ancient <dbl>,
## #   tfidf_category_and <dbl>, tfidf_category_animals <dbl>,
## #   tfidf_category_arabian <dbl>, tfidf_category_aviation <dbl>,
## #   tfidf_category_base <dbl>, tfidf_category_based <dbl>, …
# Preview the processed training data for the text-only recipe.
recipe_text %>% 
  prep() %>%
  bake(new_data = NULL)
## # A tibble: 16,009 × 117
##      num    id name      tfidf_category_abs… tfidf_category_a… tfidf_category_a…
##    <dbl> <dbl> <fct>                   <dbl>             <dbl>             <dbl>
##  1 20904   181 Risk                        0             0                     0
##  2 21823  1406 Monopoly                    0             0                     0
##  3 21710  2223 UNO                         0             0                     0
##  4  8934  1294 Clue                        0             0                     0
##  5  5281    74 Apples t…                   0             0                     0
##  6  9239   258 Fluxx                       0             0                     0
##  7  6682  2471 Zombies!…                   0             0                     0
##  8 21631  2243 Yahtzee                     0             0                     0
##  9 10522  2452 Jenga                       0             0.756                 0
## 10 21822  2921 The Game…                   0             0                     0
## # … with 15,999 more rows, and 111 more variables:
## #   tfidf_category_adventure <dbl>, tfidf_category_age <dbl>,
## #   tfidf_category_agents <dbl>, tfidf_category_american <dbl>,
## #   tfidf_category_ancient <dbl>, tfidf_category_and <dbl>,
## #   tfidf_category_animals <dbl>, tfidf_category_arabian <dbl>,
## #   tfidf_category_aviation <dbl>, tfidf_category_base <dbl>,
## #   tfidf_category_based <dbl>, tfidf_category_bluffing <dbl>, …
# Preview the processed training data for the numeric-only recipe.
recipe_no_text %>% 
  prep() %>%
  bake(new_data = NULL)
## # A tibble: 16,009 × 8
##      num    id name             minplayers maxplayers playingtime minage average
##    <dbl> <dbl> <fct>                 <dbl>      <dbl>       <dbl>  <dbl>   <dbl>
##  1 20904   181 Risk                -0.0106     0.0161     0.0460   0.106    5.59
##  2 21823  1406 Monopoly            -0.0106     0.137      0.146   -0.445    4.37
##  3 21710  2223 UNO                 -0.0106     0.257     -0.104   -0.995    5.41
##  4  8934  1294 Clue                -0.0106     0.0161    -0.0786  -0.445    5.67
##  5  5281    74 Apples to Apples     2.89       0.257     -0.104    0.656    5.79
##  6  9239   258 Fluxx               -0.0106     0.0161    -0.104   -0.445    5.67
##  7  6682  2471 Zombies!!!          -0.0106     0.0161    -0.00383  1.48     5.8 
##  8 21631  2243 Yahtzee             -0.0106     0.257     -0.104   -0.995    5.41
##  9 10522  2452 Jenga               -1.46       0.137     -0.120   -0.995    5.62
## 10 21822  2921 The Game of Life    -0.0106     0.0161    -0.0537  -0.445    4.31
## # … with 15,999 more rows
# Linear SVM with tunable cost and margin.
svm_spec <-
  svm_linear(cost = tune(),
             margin = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("LiblineaR")

# Lasso regression: mixture = 1 is pure L1; only the penalty is tuned.
lasso_spec <- 
  parsnip::linear_reg(penalty = tune(), 
                      mixture = 1) %>%
  set_engine("glmnet")

# MARS with tunable number of terms and interaction degree.
mars_spec <- 
  parsnip::mars(num_terms = tune(),
                prod_degree = tune()) %>%
  set_mode('regression') %>%
  set_engine("earth")

# Cross every recipe with every model spec: 3 recipes x 3 models = 9 workflows.
workflows <- 
  workflow_set(
    preproc = list(recipe_all = recipe_all,
                   recipe_text = recipe_text,
                   recipe_numeric = recipe_no_text), 
    models = list(svm = svm_spec,
                  lasso = lasso_spec,
                  mars = mars_spec
    ),
    cross = TRUE)

workflows
## # A workflow set/tibble: 9 × 4
##   wflow_id             info             option    result    
##   <chr>                <list>           <list>    <list>    
## 1 recipe_all_svm       <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_all_lasso     <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_all_mars      <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 recipe_text_svm      <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 recipe_text_lasso    <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 recipe_text_mars     <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 recipe_numeric_svm   <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 recipe_numeric_lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 recipe_numeric_mars  <tibble [1 × 4]> <opts[0]> <list [0]>

Here we compare the model performance. It looks like the lasso models with the full set of predictors performs the best (circles) and the models without the category text performed the worst (triangles)

# Keep predictions and allow parallel processing during tuning.
grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    allow_par = TRUE,
    parallel_over = "everything",
    verbose = TRUE
  )

# NOTE(review): 10 workers hard-coded — confirm the machine has that many cores.
cl <- 
  makeCluster(10)

doParallel::registerDoParallel(cl)

# Tune each of the 9 workflows over a 10-point grid on the CV folds,
# tracking both RMSE and MAE.
results <- 
  workflow_map(fn = "tune_grid",
               object = workflows,
               seed = 155,
               verbose = TRUE,
               control = grid_ctrl,
               grid = 10, 
               resamples = folds,
               metrics = metric_set(rmse, mae)
  )

stopCluster(cl)

# Rank all tuned workflows (best config per workflow) and plot CV RMSE by
# rank, coloured by model type and shaped by recipe type.
rank_results(results, select_best = TRUE) %>% 
  mutate(model = ifelse(str_detect(wflow_id, "lasso"), "lasso", model),
         # wflow_id encodes the recipe name, e.g. "recipe_text_svm"
         recipe_type = case_when(
           str_detect(wflow_id, "text") ~ "text_recipe",
           str_detect(wflow_id, "all") ~ "full recipe",
           TRUE ~ "no_text_recipe")) %>%
  filter(.metric == 'rmse') %>%
  ggplot(aes(x = rank, y = mean, color = model, shape = recipe_type)) +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err)) + 
  geom_point(size = 3, alpha = .75) + 
  labs(title = "Model Performance Across Recipes", subtitle = "metric: RMSE") + 
  bg_theme() + 
  ggsci::scale_color_npg()

The predicted vs actual performance of the “best” model is not very good. But again, the goal of this post was to combine text data and non-text data into a model.

# Best penalty (by CV RMSE) for the full-recipe lasso workflow.
best_results <- 
  extract_workflow_set_result(results, "recipe_all_lasso") %>% 
  select_best(metric = "rmse")

best_results
## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <fct>                
## 1 0.00350 Preprocessor1_Model08
# NOTE(review): this cluster is created but never registered with a parallel
# backend before use, so last_fit() below appears to run sequentially —
# confirm whether this makeCluster()/stopCluster() pair is needed at all.
cl <- 
  makeCluster(10)

# Finalize the best lasso workflow with the tuned penalty, fit it on the
# training set, and evaluate once on the held-out test set.
test_results <- 
   results %>% 
   extract_workflow("recipe_all_lasso") %>% 
   finalize_workflow(best_results) %>% 
   last_fit(split = splits)

collect_metrics(test_results)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <fct>               
## 1 rmse    standard       0.822 Preprocessor1_Model1
## 2 rsq     standard       0.215 Preprocessor1_Model1
# Shut down the cluster started before last_fit().
stopCluster(cl)

# Observed vs predicted ratings for the finalized model on the test set,
# with a dashed identity line for reference.
collect_predictions(test_results) %>% 
  ggplot(aes(x = average, y = .pred)) + 
  geom_abline(colour = "#e64b35", linetype = 2) + 
  geom_point(alpha = 0.35, colour = "#00a087") + 
  coord_obs_pred() + 
  labs(x = "observed", y = "predicted") + 
  bg_theme() + 
  ggsci::scale_color_npg()